1. proper reading of biom table (output: biomtable.txt)
  2. proper distinguishment between categorical and continous variables in mapping file (output: mapping_cleaned_MrOS.txt)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

biom table


In [2]:
# convert biom table to tab delimited file in bash with 'taxonomy' information remained
# biom convert -i reference-hit.tax.biom -o table.from_biom.txt --to-tsv --header-key 'taxonomy'

In [3]:
bt = pd.read_csv('../../data/table.from_biom.txt', sep='\t', index_col='#OTU ID', skiprows=1)

In [4]:
print(bt.shape)
bt.head()


(4727, 600)
Out[4]:
SD8637 PO7016 MN1789 MN1868 PA3814 MN2006 PA3070 PI5374 SD8714 BI0448 ... BI0445 PO7138 BI0215 MN2452 BI0704 PO7442 MN2439 SD8458 PI4831 taxonomy
#OTU ID
TACGTAGGTGGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGTGTAGGCGGGATATCAAGTCAGAAGTGAAAATTACGGGCTCAACTCGTAACCTGCTTTTGAAACTGACATTCTTGAGTGAAGTAGAGGCAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGGCTTTTACTGACGCTGAGGCTCGAAAGCGTGGGGAGCAAA 1819.0 15.0 0.0 195.0 190.0 54.0 0.0 0.0 818.0 177.0 ... 2.0 446.0 0.0 1299.0 112.0 14.0 0.0 229.0 4.0 k__Bacteria; p__Firmicutes; c__Clostridia; o__...
TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGACGGGTCCTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGGGACCTTGAGTGCGGCAGAGGCAGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTTGCTGGACCGTAACTGACGTTGATGCTCGAAAGTGCGGGTATCAAA 1656.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 k__Bacteria; p__Bacteroidetes; c__Bacteroidia;...
AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCCATGGGCTCAACCCATGAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTGGGTAGCAAA 983.0 249.0 0.0 978.0 236.0 261.0 379.0 0.0 455.0 53.0 ... 0.0 1977.0 242.0 0.0 0.0 0.0 225.0 109.0 0.0 k__Bacteria; p__Firmicutes; c__Clostridia; o__...
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCCTGCTAAGCTGCAACTGACATTGAGGCTCGAAAGTGTGGGTATCAAA 781.0 0.0 381.0 0.0 1311.0 1012.0 4608.0 3466.0 2953.0 3482.0 ... 0.0 2730.0 685.0 336.0 1030.0 10.0 2373.0 0.0 3.0 k__Bacteria; p__Bacteroidetes; c__Bacteroidia;...
AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTATGGGCTCAACCCATAAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTGGGTAGCAAA 649.0 315.0 0.0 0.0 829.0 1072.0 1125.0 517.0 732.0 0.0 ... 2.0 215.0 735.0 1479.0 521.0 646.0 1508.0 148.0 5910.0 k__Bacteria; p__Firmicutes; c__Clostridia; o__...

5 rows × 600 columns


In [5]:
print(bt.taxonomy.str.len().min())
print(bt.taxonomy.str.len().max())


10
131

In [6]:
bt.to_csv('../data/biomtable.txt', sep='\t')

mapping file


In [7]:
mf = pd.read_csv('../data/mapping_MrOS.txt', sep='\t', dtype=str, index_col='#SampleID')

In [8]:
print(mf.shape)
mf.head()


(599, 64)
Out[8]:
BarcodeSequence LinkerPrimerSequence Experiment_Design_Description Library_Construction_Protocol Linker Platform Center_Name Center_Project Instrument_Model Title ... OHVD3 OHVD2 OHV1D2 OHV1D2CT OHVD2CT OHVDTOT OHV1DTOT OHSEAS VDstatus Description
#SampleID
BI0023 TCTGGTGACATT GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 25.8 0.0 0.0 1: Yes 1: Yes 25.8 39.3 3:SUMMER sufficiency Orwoll.BI0023.BI
BI0056 CAAGCATGCCTA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 39.2 0.0 0.0 1: Yes 1: Yes 39.2 61.9 2:SPRING sufficiency Orwoll.BI0056.BI
BI0131 CTATTTGCGACA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 23.1 0.0 0.0 1: Yes 1: Yes 23.1 52.1 2:SPRING sufficiency Orwoll.BI0131.BI
BI0153 ATCGGCGTTACA GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 27.3 0.0 0.0 1: Yes 1: Yes 27.3 43.1 2:SPRING sufficiency Orwoll.BI0153.BI
BI0215 CCTCTCGTGATC GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina BI MrOS Illumina MiSeq MrOS_VitaminD ... 33 0.0 0.0 1: Yes 1: Yes 33.0 50.2 4:FALL sufficiency Orwoll.BI0215.BI

5 rows × 64 columns


In [9]:
vars_cat = np.array(['BarcodeSequence', 'LinkerPrimerSequence', 'Experiment_Design_Description',
             'Library_Construction_Protocol', 'Linker', 'Platform', 'Center_Name', 'Center_Project', 'Instrument_Model',
             'Title', 'Anonymized_Name', 'Scientific_Name', 'Taxon_ID', 'Sample_Type', 'Geo_Loc_Name', 'Elevation', 'Env_Biome',
             'Env_Feature', 'Env_Material', 'Env_Package', 'Collection_Timestamp', 'DNA_Extracted', 'Physical_Specimen_Location',
             'Physical_Specimen_Remaining', 'Age_Units', 'Host_Subject_ID', 'Host_Taxid','Host_Scientific_Name', 'Host_Common_Name',
             'Life_Stage', 'Sex', 'Height_Units', 'Weight_Units', 'Body_Habitat', 'Body_Site', 'Body_Product', 'GIERACE', 'SITE',
             'TUDRAMT', 'TURSMOKE', 'M1ADEPR', 'M1VITMND', 'M1ANTIB', 'M1PROBI', 'OHSEAS', 'VDstatus', 'Description',
             'OHV1D2CT', 'OHVD2CT'])
vars_cts = np.array(['Latitude', 'Longitude', 'Age', 'Height', 'Weight', 'BMI', 'PASCORE', 'DTVITD', 
             'OHV1D3', 'OHV24D3', 'OHVD3', 'OHVD2', 'OHV1D2',  'OHVDTOT', 'OHV1DTOT'])

In [10]:
# convert vars_cts to numeric and vars_cat to factors
df = mf.copy()
df[vars_cts] = df[vars_cts].apply(pd.to_numeric, errors='coerce')
df[vars_cat] = df[vars_cat].apply(lambda x: x.astype('category'))

In [11]:
# convert all pg/ml to ng/ml note: 1 ng/ml = 1000 pg/ml
df.OHV1D3 = df.OHV1D3/1000
df.OHV1D2 = df.OHV1D2/1000
df.OHV1DTOT = df.OHV1DTOT/1000

In [12]:
#df.M1ANTIB.value_counts()

add two ratio variables of Vitamin D


In [13]:
# df['ratio_activation'] = df.OHV1D3/(df.OHVD3*1000) # pg/ml vs. ng/ml
# df['ratio_catabolism'] = df.OHV24D3/df.OHVD3 # both ng/ml
df['ratio_activation'] = df.OHV1D3/df.OHVD3
df['ratio_catabolism'] = df.OHV24D3/df.OHVD3 
vars_cts = np.append(vars_cts, ['ratio_activation', 'ratio_catabolism'])

In [14]:
df[vars_cts].describe()


Out[14]:
Latitude Longitude Age Height Weight BMI PASCORE DTVITD OHV1D3 OHV24D3 OHVD3 OHVD2 OHV1D2 OHVDTOT OHV1DTOT ratio_activation ratio_catabolism
count 599.000000 599.000000 599.000000 599.000000 599.000000 599.000000 599.000000 584.000000 567.000000 567.000000 556.000000 567.000000 567.000000 556.000000 567.000000 556.000000 556.000000
mean 39.131492 -105.850862 84.237062 172.143573 80.173957 27.009647 122.482829 164.395957 0.057775 3.430864 35.229137 0.769136 0.000177 36.013489 0.057951 0.001772 0.094776
std 5.226215 17.221362 4.061471 6.803620 12.994918 3.788662 66.667650 127.458600 0.019773 1.834771 12.450758 4.266210 0.002232 12.410942 0.019656 0.000735 0.029770
min 32.715738 -122.676500 78.000000 153.850000 51.400000 17.599566 0.000000 0.429000 0.010700 0.300000 7.800000 0.000000 0.000000 7.800000 0.010700 0.000398 0.018788
25% 33.520661 -122.143000 81.000000 167.325000 71.000000 24.506948 70.142857 77.962500 0.044100 2.175000 27.400000 0.000000 0.000000 28.100000 0.044250 0.001316 0.074216
50% 40.440625 -117.161100 83.000000 172.100000 79.000000 26.724552 116.750000 128.125000 0.055500 3.180000 33.650000 0.000000 0.000000 34.150000 0.055500 0.001660 0.092821
75% 44.977753 -86.802490 87.000000 176.725000 87.500000 28.975480 165.785714 211.642500 0.066300 4.235000 41.825000 0.000000 0.000000 42.300000 0.066450 0.002081 0.112849
max 45.523062 -79.995890 98.000000 194.650000 127.600000 42.931509 359.142857 982.830000 0.156000 14.070000 104.000000 59.000000 0.037800 104.000000 0.156000 0.006727 0.197786

In [15]:
df[vars_cat].describe()


Out[15]:
BarcodeSequence LinkerPrimerSequence Experiment_Design_Description Library_Construction_Protocol Linker Platform Center_Name Center_Project Instrument_Model Title ... TURSMOKE M1ADEPR M1VITMND M1ANTIB M1PROBI OHSEAS VDstatus Description OHV1D2CT OHVD2CT
count 599 599 599 599 599 599 599 599 599 599 ... 599 599 599 599 599 599 588 599 599 599
unique 599 1 1 1 1 1 6 1 1 1 ... 4 2 2 2 2 5 3 599 3 3
top TTGTCTGGAAGC GGACTACHVGGGTWTCTAAT 16S stool samples sequenced for MrOS Vitamin D... 16S rRNA v4 GT Illumina SD MrOS Illumina MiSeq MrOS_VitaminD ... 1:PAST 0: No 1: Yes 0: No 0: No 3:SUMMER sufficiency Orwoll.SD9009.SD 1: Yes 1: Yes
freq 1 599 599 599 599 599 134 599 599 599 ... 289 546 445 558 573 220 516 1 563 448

4 rows × 49 columns


In [16]:
df[vars_cts].isnull().sum()


Out[16]:
Latitude             0
Longitude            0
Age                  0
Height               0
Weight               0
BMI                  0
PASCORE              0
DTVITD              15
OHV1D3              32
OHV24D3             32
OHVD3               43
OHVD2               32
OHV1D2              32
OHVDTOT             43
OHV1DTOT            32
ratio_activation    43
ratio_catabolism    43
dtype: int64

In [17]:
# for i in range(len(vars_cat)):
#     print(df[vars_cat[i]].value_counts())

In [18]:
# check
print(mf.shape)
print(df.shape)


(599, 64)
(599, 66)

In [19]:
df.to_csv('../data/mapping_cleaned_MrOS.txt', sep= '\t', index=True)

In [ ]: